You do not actually have to run this; it is only included so you know how to set it up in the future. You can skip down to ## Map Plotting. To see where the coordinates come from, see: https://github.com/dy-lin/hs19-trends/blob/master/R/LatLongScript.Rmd
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_logical(),
## X1 = col_double()
## )
## See spec(...) for full column specifications.
*This will show us a map of the whole world, with each country coloured by the (log-scaled) number of papers it produced.*
# World map: fill each country by the log of its paper count. The extra
# `text` aesthetic is unknown to ggplot2 (hence the warning below), but
# plotly picks it up for the hover tooltip.
world_plot <- ggplot() +
  geom_polygon(
    data = world_data2,
    aes(x = long, y = lat, group = group, fill = log(Count), text = Count)
  ) +
  #geom_point(data=data,aes(x = long, y = lat),alpha=0.5,size=0.5,colour="grey")+
  coord_fixed(1.3) +
  theme_void() +
  scale_fill_viridis()
## Warning: Ignoring unknown aesthetics: text
ggplotly(world_plot, tooltip = "text")
## R Markdown
### Restrict the world map to Europe only
# Bounding box in WGS84 lon/lat roughly covering Europe
europe_bbox <- sf::st_bbox(
  c(xmin = -9, ymin = 36, xmax = 42.5, ymax = 70.1),
  crs = st_crs(4326)
) %>%
  sf::st_as_sfc()
# Which country polygons lie entirely inside the bounding box?
inside_box <- sf::st_within(world_sf, europe_bbox)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#> although coordinates are longitude/latitude, st_within assumes that they are planar
# Keep only the polygons with at least one containment match
europe_sf <- world_sf[lengths(inside_box) > 0, ]
# Test every institution point against every European polygon;
# sparse = FALSE yields a points x polygons logical matrix.
europe_result <- st_within(point_sf, europe_sf, sparse = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
# Calculate the total count of points inside each polygon and store it
# as a new column "Count" in europe_sf (the old comment said world_sf,
# but the result is stored in europe_sf).
# FIX: colSums() is the idiomatic, faster replacement for apply(x, 2, sum).
europe_sf <- europe_sf %>%
  mutate(Count = colSums(europe_result))
# Drop the geometry column so europe_sf becomes a plain data frame
europe_df <- sf::st_set_geometry(europe_sf, NULL)
# Polygon outlines that ggplot2's geom_polygon understands
world_data <- map_data("world")
# Attach the map coordinates to the per-country counts, keyed on region
europe_data <- left_join(europe_df, world_data, by = "region")
# Keep only the institution points that fall inside a European polygon
ind <- sf::st_intersects(point_sf, europe_sf)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
points_europe <- point_sf[which(lengths(ind) != 0), ]
# Add explicit X/Y coordinate columns so ggplot can place the points
points_europe <- cbind(points_europe, st_coordinates(points_europe))
# FIX: use `<-`, not `=`, for assignment.
# NOTE(review): columns 6 and 7 are dropped by hard-coded position —
# presumably duplicated coordinate columns; verify against point_sf's layout.
points_europe <- points_europe[, -c(6, 7)]
*Now we are plotting the number of papers that come out of each country in Europe, and also adding the locations of the institutions.*
# Load the pre-computed Europe polygons and institution points
europe_data <- read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_data.csv")
points_europe <- read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_points.csv")
# Hover label for each country: "region;count"
text <- paste(europe_data$region, europe_data$Count, sep = ";")
pl <- ggplot() +
  # BUG FIX: `text` was computed above but never mapped, so country
  # polygons had no hover tooltip; map it here, mirroring the world map.
  geom_polygon(data = europe_data,
               aes(x = long, y = lat, group = group, fill = log(Count),
                   text = text)) +
  geom_point(data = points_europe,
             aes(x = X, y = Y, text = str_wrap(affiliation, 50)),
             alpha = 0.5, size = 0.5, colour = "grey") +
  coord_fixed(1.3) +
  scale_fill_viridis() +
  theme_void()
## Warning: Ignoring unknown aesthetics: text
ggplotly(pl, tooltip = "text")
Let’s load our packages
library(tidyverse)
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(widyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following object is masked from 'package:rgeos':
##
## union
## The following object is masked from 'package:plotly':
##
## groups
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggplot2)
library(ggraph)
##
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
##
## geometry
library(readr)
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
Load in the CSV, see what topic options we have available
# Pull the pre-computed bigram table from the workshop repository
bigrams_url <- "https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/bigrams.csv"
df <- read.csv(bigrams_url)
# List every distinct topic present in the data
print(kable(unique(df$topic)))
##
##
## |x |
## |:---------------------|
## |Assembly |
## |Databases |
## |Epigenetics |
## |Gene Expression |
## |Genome Annotation |
## |Phylogenetics |
## |Sequence Alignment |
## |Sequencing |
## |Structural Prediction |
## |Variant Calling |
*This takes the bigram frequency determination and plotting and wraps it in one function, visualize_bigrams(). The plotting starts on line 228.*
# Build, save, and return a bigram network graph for one topic.
#
# df_name:     data frame holding the text to analyse
# textfield:   unquoted column name containing the raw text (tidy-eval, {{ }})
# topic_title: plot title; also embedded in the output file name
#
# Side effect: writes the plot to ../figures/bigrams_<topic>.png.
# Returns the ggraph plot object.
visualize_bigrams <- function(df_name, textfield, topic_title){
  # Clean the text: strip non-alphanumeric characters, drop stop words
  df_cleaned <- df_name %>%
    mutate(textfield_clean = removeWords(gsub("[^A-Za-z0-9 ]", "", {{textfield}}), stop_words$word))
  # Tokenize into bigrams and tabulate their frequencies
  df_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2)
  df_freq <- as.data.frame(table(df_bigrams$bigrams)) %>%
    arrange(desc(Freq))
  # Keep the 100 most frequent bigrams (ties kept) and split into word pairs
  df_top_bigrams <- df_freq %>%
    top_n(100, Freq) %>%
    separate(Var1, c("word1", "word2"))
  top_bigram_words <- c(df_top_bigrams$word1, df_top_bigrams$word2) %>%
    unique()
  # Unigram frequencies for the words in the top bigrams — these size
  # and colour the graph nodes
  word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1)
  df_word_list <- as.data.frame(table(word_list$words)) %>%
    arrange(desc(Freq)) %>%
    filter(Var1 %in% top_bigram_words)
  names(df_word_list)[2] <- "Term_Frequency"
  names(df_top_bigrams)[3] <- "Edge_Frequency"
  # FIX: use left assignment `<-` instead of right assignment `->`
  graph_hold <- graph_from_data_frame(vertices = df_word_list, d = df_top_bigrams)
  pl <- graph_hold %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_colour = log(Edge_Frequency)), show.legend = TRUE) +
    geom_node_point(aes(color = Term_Frequency, size = Term_Frequency), alpha = 0.7) +
    scale_fill_viridis_c() +
    scale_edge_color_viridis(direction=-1)+
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    guides(size=FALSE) +
    labs(title = quo_name(topic_title)) +
    theme(plot.title = element_text(size = 26, face = "bold"))
  # FIX: str_replace only replaces the FIRST whitespace; str_replace_all
  # handles multi-word titles (e.g. "Gene Expression Data") correctly.
  ggsave(pl,
         filename = paste0("../figures/", "bigrams_",
                           str_to_lower(str_replace_all(topic_title, "\\s", "_")),
                           ".png"),
         width = 12,
         height = 8)
  pl
}
Now that we have the function made, decide on a topic and make a bigram diagram for that topic.
# Subset to the Assembly topic and draw its bigram network
df_assembly <- filter(df, topic == "Assembly")
visualize_bigrams(df_assembly, abstract, "")
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`